Bad weather Kelheim Demo

Oleksandr Soboliev

Research Meteostat

After some researches about meteostat data nearest station in DE that belongs to Kelheim region are:

there are many of them so I am starting to think about extracting all from the Bayern or extract the nearest from longtitude/latitude point with the Kelheim shapefile(using json and Euclid distances)

Kelheim has no weather station, but it could be reconstructed with 2 other

Hohenfels with id: “10775” and Ingolstadt with id:“10860” kelheim_data = {weight1}x{hohenfels} + {weight2}x{inglstadt}

Also this site shows, that there are many of the Kelheim stations in this area, but meteostat doesn’t contain them https://www.wunderground.com/dashboard/pws/IKELHE5

Research Weatherstack

weatherstack_kelheim = read_delim("data/Kelheim_weather_since_july_2008.csv",delim = ",")
## Rows: 120312 Columns: 6
## ── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): description
## dbl  (4): hour, precip, visibility, totalsnow_daily
## date (1): date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
print(weatherstack_kelheim)
## # A tibble: 120,312 × 6
##    date        hour description precip visibility totalsnow_daily
##    <date>     <dbl> <chr>        <dbl>      <dbl>           <dbl>
##  1 2008-07-01     0 Clear            0         10               0
##  2 2008-07-01   100 Clear            0         10               0
##  3 2008-07-01   200 Clear            0         10               0
##  4 2008-07-01   300 Clear            0         10               0
##  5 2008-07-01   400 Clear            0         10               0
##  6 2008-07-01   500 Clear            0         10               0
##  7 2008-07-01   600 Sunny            0         10               0
##  8 2008-07-01   700 Sunny            0         10               0
##  9 2008-07-01   800 Sunny            0         10               0
## 10 2008-07-01   900 Sunny            0         10               0
## # … with 120,302 more rows
## # ℹ Use `print(n = ...)` to see more rows

What to take as a reffer point isn’t clear because of the date(before/after covid) and weather type (sunny,clear,temperature) Also there is no temperature in it :/

Import mobility from Google

#global_mobility = read_delim("https://www.gstatic.com/covid19/mobility/Global_Mobility_Report.csv",",")
#de_mobility = global_mobility %>% filter(country_region_code == "DE")
#print(unique(de_mobility$sub_region_1))

As we can see the most precise region to filter data from is Bavaria :/

Relevant data for the , mobility

#bavaria_mobility = de_mobility %>% filter(sub_region_1 == "Bavaria")
#bavaria_mobility = bavaria_mobility %>% #select(country_region,sub_region_1,date,residential_percent_change_from_baseline) %>%
#  mutate(residential_percent_change_from_baseline = -residential_percent_change_from_baseline,
#         source = "Google")%>%
#  rename(BundeslandID = sub_region_1,not_at_home_change = residential_percent_change_from_baseline)
#bavaria_mobility = bavaria_mobility %>% select(date,BundeslandID,not_at_home_change,source)
#Need to filter out weekends

#plt = ggplot(bavaria_mobility)+
#  geom_point(aes(x = date,y = not_at_home_change))
#ggplotly(plt)

Import mobility from Senozon

snz_mobility = read_delim("data/LK_mobilityData_weekdays.csv",";")
## Rows: 50652 Columns: 4
## ── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────
## Delimiter: ";"
## chr (1): Landkreis
## dbl (3): date, outOfHomeDuration, percentageChangeComparedToBeforeCorona
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
snz_mobility = snz_mobility %>% filter(Landkreis == "Landkreis Kelheim") %>% mutate(source = "senozon") %>% select(-outOfHomeDuration) %>% rename(not_at_home_change = percentageChangeComparedToBeforeCorona)
snz_mobility$date = as.Date(strptime(snz_mobility$date,"%Y%m%d"))
plt = ggplot(snz_mobility)+
  geom_point(aes(x = date,y = not_at_home_change))
ggplotly(plt)

Aggregate 2 sources

weatherstack_kelheim_daily = weatherstack_kelheim %>%
  group_by(date) %>%
  summarize(precip_day = sum(precip),visibility_mean = mean(visibility),totalsnow_daily = mean(totalsnow_daily))
  
weatherstack_kelheim_weekly = weatherstack_kelheim_daily %>% 
  mutate(year_week = paste0(isoyear(date),"-",isoweek(date))) %>%
  group_by(year_week) %>%
  summarize(date = first(date), precip_week = sum(precip_day),visibility_mean = mean(visibility_mean),totalsnow_weekly =sum( totalsnow_daily))
weatherstack_kelheim_weekly = unique(weatherstack_kelheim_weekly)
print(weatherstack_kelheim_weekly)
## # A tibble: 717 × 5
##    year_week date       precip_week visibility_mean totalsnow_weekly
##    <chr>     <date>           <dbl>           <dbl>            <dbl>
##  1 2008-27   2008-07-01         7.7            9.22                0
##  2 2008-28   2008-07-07        53.7            8.20                0
##  3 2008-29   2008-07-14        14              8.46                0
##  4 2008-30   2008-07-21         4              9.07                0
##  5 2008-31   2008-07-28         9              9.64                0
##  6 2008-32   2008-08-04        13.4            9.52                0
##  7 2008-33   2008-08-11        33              8.55                0
##  8 2008-34   2008-08-18        29.8            9.46                0
##  9 2008-35   2008-08-25         2.5            9.39                0
## 10 2008-36   2008-09-01        26.6            8.10                0
## # … with 707 more rows
## # ℹ Use `print(n = ...)` to see more rows
#mob_joined = rbind(snz_mobility,bavaria_mobility)
snz_mobility_year_week = snz_mobility %>% 
  mutate(year_week = paste0(isoyear(date),"-",isoweek(date))) %>%
  group_by(year_week) %>%
  summarize(date = first(date),not_at_home_change = mean(not_at_home_change))
mob_joined_with_weather = snz_mobility_year_week %>% inner_join(weatherstack_kelheim_weekly, by = "year_week") %>% select(-date.y) %>% rename(date = date.x)
print(mob_joined_with_weather)
## # A tibble: 107 × 6
##    year_week date       not_at_home_change precip_week visibility_mean totalsnow_weekly
##    <chr>     <date>                  <dbl>       <dbl>           <dbl>            <dbl>
##  1 2020-10   2020-03-06                  0        27.2            8.63              1.4
##  2 2020-11   2020-03-13                  0        15.5            8.70              0  
##  3 2020-12   2020-03-20                -14        13.3            8.75              7.3
##  4 2020-13   2020-03-27                -23         0             10                 0  
##  5 2020-14   2020-04-03                -20         0.5            9.99              0  
##  6 2020-15   2020-04-10                -18         0.2            9.99              0  
##  7 2020-16   2020-04-17                -19         8.8            9.88              0  
##  8 2020-17   2020-04-24                -17         0             10                 0  
##  9 2020-18   2020-05-01                -13        20.4            8.73              0  
## 10 2020-19   2020-05-08                 -9         8.4            9.76              0  
## # … with 97 more rows
## # ℹ Use `print(n = ...)` to see more rows
#First plot with colour as precipitation
plt_color = ggplot(mob_joined_with_weather)+
  geom_point(aes(x = date,y = not_at_home_change,colour = precip_week))+
  scale_color_gradient2()
  

ggplotly(plt_color)
#Second plot as another line as precipitation
plt_line = ggplot(mob_joined_with_weather)+
  geom_point(aes(x = date,y = not_at_home_change))+
  geom_line(aes(x = date,y = precip_week*0.5,color = "red"))
  

ggplotly(plt_line)
plt_hist_precip = ggplot(mob_joined_with_weather,aes(x = precip_week,y = not_at_home_change))+
  stat_summary_bin(fun = "mean",
                   geom = "bar",
                   binwidth = 2,fill = "blue")

ggplotly(plt_hist_precip)
plt_hist_visibility = ggplot(mob_joined_with_weather,aes(x = visibility_mean,y = not_at_home_change))+
  stat_summary_bin(fun = "mean",
                   geom = "bar",
                   binwidth = 0.5,fill = "blue")

ggplotly(plt_hist_visibility)
plt_hist_totalsnow = ggplot(mob_joined_with_weather,aes(x = totalsnow_weekly,y = not_at_home_change))+
  stat_summary_bin(fun = "mean",
                   geom = "bar",
                   binwidth = 7,fill = "blue")

ggplotly(plt_hist_totalsnow)

Let’s try it out with meteostat data, that contains scope of the 2022 without restictions

Ingolstadt data from id = 10860 station

ingolstadt_weather = read_delim("https://bulk.meteostat.net/v2/daily/10860.csv.gz",",",col_names = FALSE)
## Rows: 19279 Columns: 11
## ── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────
## Delimiter: ","
## dbl  (9): X2, X3, X4, X5, X6, X7, X8, X9, X10
## lgl  (1): X11
## date (1): X1
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
colnames(ingolstadt_weather) = c("date", "tavg", "tmin", "tmax", "prcp", "snow", "wdir", "wspd", "wpgt", "pres", "tsun")


# We don't need data of weather before 2020, because of snz_mobility date, also data isn't precise

ingolstadt_weather = ingolstadt_weather %>% filter(year(date)>=2020)%>% replace_na(list(snow = 0))

print(ingolstadt_weather)
## # A tibble: 961 × 11
##    date        tavg  tmin  tmax  prcp  snow  wdir  wspd  wpgt  pres tsun 
##    <date>     <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <lgl>
##  1 2020-01-01  -0.8  -3.5   3.4   0       0    70   6.5  20.5 1036. NA   
##  2 2020-01-02  -2.9  -6.1  -1.2   0       0   243   4    16.6 1033. NA   
##  3 2020-01-03   1.2  -2.4   5.9   0.3     0   218   7.2  29.5 1028  NA   
##  4 2020-01-04   4.4   3.4   6.2   0.5     0   278  20.5  48.2 1030  NA   
##  5 2020-01-05   2.8  -1.8   4.5   0       0   271   9    38.9 1037. NA   
##  6 2020-01-06  -1.2  -3.7   4.5   0       0   119   4.3  14.8 1032. NA   
##  7 2020-01-07  -0.4  -5     2.6   0.9     0   233   5    24.1 1032. NA   
##  8 2020-01-08   1    -2.3   3.2   0       0   100   2.9  13   1032. NA   
##  9 2020-01-09   4.3  -0.1  10.8   0       0   109   2.9  11.2 1024. NA   
## 10 2020-01-10   4.9  -1.2  11.2   0.2     0   230   9.4  37.1 1023  NA   
## # … with 951 more rows
## # ℹ Use `print(n = ...)` to see more rows

Hohenfels data from id = 10775 station

hohenfels_weather = read_delim("https://bulk.meteostat.net/v2/daily/10775.csv.gz",",",col_names = FALSE)
## Rows: 6304 Columns: 11
## ── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────
## Delimiter: ","
## dbl  (8): X2, X3, X4, X5, X6, X7, X8, X10
## lgl  (2): X9, X11
## date (1): X1
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
colnames(hohenfels_weather) = c("date", "tavg", "tmin", "tmax", "prcp", "snow", "wdir", "wspd", "wpgt", "pres", "tsun")


# We don't need data of weather before 2020, because of snz_mobility date, also data isn't precise

hohenfels_weather = hohenfels_weather %>% filter(year(date)>=2020) %>% replace_na(list(snow = 0))

print(hohenfels_weather)
## # A tibble: 928 × 11
##    date        tavg  tmin  tmax  prcp  snow  wdir  wspd wpgt   pres tsun 
##    <date>     <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <lgl> <dbl> <lgl>
##  1 2020-01-01  -4.3  -8.2   1.5    NA     0    NA  NA   NA      NA  NA   
##  2 2020-01-02  -4.5  -9.7  -0.7    NA     0   355   0.6 NA    1033  NA   
##  3 2020-01-03   0.5  -2.5   4.5    NA     0   153   3.7 NA    1028. NA   
##  4 2020-01-04   3.4   2.2   5.7    NA     0   264  15.5 NA    1028. NA   
##  5 2020-01-05   1.3  -2.1   3.3    NA     0   334   3.7 NA    1036. NA   
##  6 2020-01-06  -1.1  -3.6   2.5    NA     0    18   2.1 NA    1032. NA   
##  7 2020-01-07   0.2  -3.4   3      NA     0   332   2.6 NA    1031. NA   
##  8 2020-01-08   1.5   0.2   2.6    NA     0    84   3.4 NA      NA  NA   
##  9 2020-01-09   4.1   1.5   9.3    NA     0    35   2.3 NA    1024. NA   
## 10 2020-01-10   5.1   1.3   9.4    NA     0   323   3.8 NA    1022. NA   
## # … with 918 more rows
## # ℹ Use `print(n = ...)` to see more rows

As we can see in Hohenfels data isn’t that accurate and precipitation is data is missing fr year 2020, so for the further analysis we take only Ingolstadt data.

ingolstadt_weather_weekly = ingolstadt_weather %>% 
  mutate(year_week = paste0(isoyear(date),"-",isoweek(date))) %>%
  group_by(year_week) %>%
  summarize(date = first(date), prcp_week = sum(prcp), tavg= mean(tavg),snow_week =sum( snow),wspd = mean(wspd),tmax = max(tmax)) %>%
  arrange(year_week)
#ingolstadt_weather_weekly = unique(weatherstack_kelheim_weekly)
print(ingolstadt_weather_weekly)
## # A tibble: 138 × 7
##    year_week date       prcp_week  tavg snow_week  wspd  tmax
##    <chr>     <date>         <dbl> <dbl>     <dbl> <dbl> <dbl>
##  1 2020-1    2020-01-01       0.8  0.94         0  9.44   6.2
##  2 2020-10   2020-03-02      13.9  4.14         0 13.2    9.6
##  3 2020-11   2020-03-09      13.1  7.14         0 15.4   17.7
##  4 2020-12   2020-03-16       9.9  7.16         0  9.21  19  
##  5 2020-13   2020-03-23       0    4.44         0 14.2   NA  
##  6 2020-14   2020-03-30       0    4.61         0  7.86  17.3
##  7 2020-15   2020-04-06       0   12.2          0  5.23  22.4
##  8 2020-16   2020-04-13       3.8 10.9          0  8.23  23.6
##  9 2020-17   2020-04-20       0   12.9          0 14.7   21  
## 10 2020-18   2020-04-27      18.6 11.3          0 10.3   20.1
## # … with 128 more rows
## # ℹ Use `print(n = ...)` to see more rows
mob_joined_with_ingolstadt = ingolstadt_weather_weekly %>% 
  inner_join(snz_mobility_year_week, by = "year_week") %>% 
  select(-date.x) %>%
  rename(date = date.y) %>%
  replace_na(list(tmax = 0))

print(mob_joined_with_ingolstadt)
## # A tibble: 126 × 8
##    year_week prcp_week  tavg snow_week  wspd  tmax date       not_at_home_change
##    <chr>         <dbl> <dbl>     <dbl> <dbl> <dbl> <date>                  <dbl>
##  1 2020-10        13.9  4.14         0 13.2    9.6 2020-03-06                  0
##  2 2020-11        13.1  7.14         0 15.4   17.7 2020-03-13                  0
##  3 2020-12         9.9  7.16         0  9.21  19   2020-03-20                -14
##  4 2020-13         0    4.44         0 14.2    0   2020-03-27                -23
##  5 2020-14         0    4.61         0  7.86  17.3 2020-04-03                -20
##  6 2020-15         0   12.2          0  5.23  22.4 2020-04-10                -18
##  7 2020-16         3.8 10.9          0  8.23  23.6 2020-04-17                -19
##  8 2020-17         0   12.9          0 14.7   21   2020-04-24                -17
##  9 2020-18        18.6 11.3          0 10.3   20.1 2020-05-01                -13
## 10 2020-19         2.3 13.3          0  6.73  25.8 2020-05-08                 -9
## # … with 116 more rows
## # ℹ Use `print(n = ...)` to see more rows
#First plot with colour as precipitation
plt_ing_color = ggplot(mob_joined_with_ingolstadt)+
  geom_point(aes(x = date,y = not_at_home_change,colour = prcp_week))+
  scale_color_gradient2()

ggplotly(plt_ing_color)
plt_ing_color = ggplot(mob_joined_with_ingolstadt)+
  geom_point(aes(x = date,y = not_at_home_change))+
  geom_line(aes(x = date,y = prcp_week,color = "red"))

ggplotly(plt_ing_color)
plt_hist_precip_ing = ggplot(mob_joined_with_ingolstadt,aes(x = prcp_week,y = not_at_home_change))+
  stat_summary_bin(fun = "mean",
                   geom = "bar",
                   binwidth = 2,fill = "blue")

ggplotly(plt_hist_precip_ing)
plt_hist_precip_ing = ggplot(mob_joined_with_ingolstadt,aes(x = tavg,y = not_at_home_change))+
  stat_summary_bin(fun = "mean",
                   geom = "bar",
                   binwidth = 2,fill = "blue")

ggplotly(plt_hist_precip_ing)
plt_hist_precip_ing = ggplot(mob_joined_with_ingolstadt,aes(x = tmax,y = not_at_home_change))+
  stat_summary_bin(fun = "mean",
                   geom = "bar",
                   binwidth = 2,fill = "blue")

ggplotly(plt_hist_precip_ing)
plt_hist_precip_ing = ggplot(mob_joined_with_ingolstadt,aes(x = snow_week,y = not_at_home_change))+
  stat_summary_bin(fun = "mean",
                   geom = "bar",
                   binwidth = 5,fill = "blue")

ggplotly(plt_hist_precip_ing)

After first look at data, we can assume that hours out of home strongly depend on average temperature outside, that sounds logical. Mb categorization seasons of the data will help to understand this function

mob_joined_with_ingolstadt = mob_joined_with_ingolstadt %>% 
  mutate(season = ifelse(month(date) %in% c(12,1,2),"winter",NA)) %>%
  mutate(season = ifelse(month(date) %in% c(3,4,5),"spring",season)) %>%
  mutate(season = ifelse(month(date) %in% c(6,7,8),"summer",season)) %>%
  mutate(season = ifelse(month(date) %in% c(9,10,11),"autumn",season))
plt_hist_season = ggplot(mob_joined_with_ingolstadt,aes(x = season,y = not_at_home_change))+
  stat_summary_bin(fun = "mean",
                   geom = "bar",
                   binwidth = 5,fill = "blue")

ggplotly(plt_hist_season)